Functions
# espresso_deseq2 <-
# read_tsv(
# paste0(wd, 'Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv')
# )
# espresso_deseq2
# Load the gzipped TSV of CPM quantification values into a tibble.
# `wd` is defined outside this chunk; the path is built with paste0(), so `wd`
# presumably ends with a path separator -- confirm where it is set.
drs_cpm <-
  paste0(wd, 'Tables/DRS_quantification/espresso_quantification_cpm_2024-04-19.tsv.gz') |>
  read_tsv()
## Rows: 330453 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): transcript_id, transcript_name, gene_id, type, si, seqname, source...
## dbl (6): rep, count, total_reads, cpm, start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drs_cpm
## # A tibble: 330,453 × 20
## transcript_id transcript_name gene_id type si rep count total_reads
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENST00000498442.1 CRBN-212 ENSG00… siME… I 1 0 3552783
## 2 ENST00000498442.1 CRBN-212 ENSG00… siME… I 2 1 997879
## 3 ENST00000498442.1 CRBN-212 ENSG00… siME… I 3 0 2778705
## 4 ENST00000498442.1 CRBN-212 ENSG00… siME… G 1 0 3497396
## 5 ENST00000498442.1 CRBN-212 ENSG00… siME… G 2 0 3810844
## 6 ENST00000498442.1 CRBN-212 ENSG00… siME… G 3 0 3668094
## 7 ENST00000498442.1 CRBN-212 ENSG00… Cont D 1 1 2701773
## 8 ENST00000498442.1 CRBN-212 ENSG00… Cont D 2 1 3406597
## 9 ENST00000498442.1 CRBN-212 ENSG00… Cont D 3 0 3653792
## 10 ENST00000459840.5 CRBN-205 ENSG00… siME… I 1 1.08 3552783
## # ℹ 330,443 more rows
## # ℹ 12 more variables: cpm <dbl>, seqname <chr>, source <chr>, feature <chr>,
## # start <dbl>, end <dbl>, score <chr>, strand <chr>, frame <chr>,
## # gene_type <chr>, gene_name <chr>, transcript_type <chr>
# Load the gzipped TSV of significant positions (one row per transcript
# position, with its reference k-mer in `ref_kmer`) into a tibble.
# `wd` is defined outside this chunk; the path is built with paste0(), so `wd`
# presumably ends with a path separator -- confirm where it is set.
common_intensity_up_positions <-
  paste0(wd, 'Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-10.tsv.gz') |>
  read_tsv()
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
common_intensity_up_positions
## # A tibble: 605 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 43 ACACA 1
## 5 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 6 ENST00000389680.2 MT-RNR1-201 71 GTTCA 1
## 7 ENST00000389680.2 MT-RNR1-201 73 TCACC 1
## 8 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 9 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 10 ENST00000389680.2 MT-RNR1-201 138 GCTTA 1
## # ℹ 595 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Count, per transcript, the significant positions whose reference k-mer has a
# C as its central base (two bases on either side). Judging by the variable
# name these are treated as candidate m3C sites.
#
# The pattern is anchored so that only the central base of a 5-mer is tested;
# unanchored, a longer k-mer would match on any C with two flanking characters.
# For 5-mers the anchored and unanchored patterns select the same rows.
#
# count() yields one ungrouped row per (transcript_id, transcript_name), sorted
# by those keys, with the tally stored as an integer column `num_sites`, so no
# separate group_by()/ungroup() pair is needed.
num_m3C_sites <-
  common_intensity_up_positions |>
  filter(grepl('^.{2}C.{2}$', ref_kmer)) |>
  count(transcript_id, transcript_name, name = 'num_sites')
num_m3C_sites
## # A tibble: 71 × 3
## transcript_id transcript_name num_sites
## <chr> <chr> <int>
## 1 ENST00000009589.8 RPS20-201 1
## 2 ENST00000199764.7 CEACAM6-201 1
## 3 ENST00000202773.14 RPL6-201 2
## 4 ENST00000215754.8 MIF-201 4
## 5 ENST00000229239.10 GAPDH-201 2
## 6 ENST00000230050.4 RPS12-201 4
## 7 ENST00000233143.6 TMSB10-201 15
## 8 ENST00000234875.9 RPL22-201 2
## 9 ENST00000243997.8 ATP5F1E-201 3
## 10 ENST00000254810.8 H3-3B-201 1
## # ℹ 61 more rows
# Attach the per-transcript site count to every row of the CPM table.
#
# The join keys are spelled out so that the result does not change silently if
# the two tables ever gain another column with a shared name.
# `relationship = 'many-to-one'` makes the join fail loudly if a transcript ever
# appears more than once in `num_m3C_sites`, which would duplicate CPM rows.
expression_num_m3C_sites <-
  drs_cpm |>
  left_join(
    num_m3C_sites,
    by = join_by(transcript_id, transcript_name),
    relationship = 'many-to-one'
  ) |>
  # Transcripts without any counted site get 0; `0L` matches the integer type
  # of `num_sites`.
  replace_na(list(num_sites = 0L)) |>
  # Move the columns of interest to the front; all others keep their order.
  select(transcript_id, cpm, num_sites, everything()) |>
  # Pseudocount of 0.01 keeps CPM == 0 finite on the log10 scale.
  mutate(log10_cpm_plus = log10(cpm + 0.01))
## Joining with `by = join_by(transcript_id, transcript_name)`
expression_num_m3C_sites
## # A tibble: 330,453 × 22
## transcript_id cpm num_sites transcript_name gene_id type si rep count
## <chr> <dbl> <int> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000498… 0 0 CRBN-212 ENSG00… siME… I 1 0
## 2 ENST00000498… 1.00 0 CRBN-212 ENSG00… siME… I 2 1
## 3 ENST00000498… 0 0 CRBN-212 ENSG00… siME… I 3 0
## 4 ENST00000498… 0 0 CRBN-212 ENSG00… siME… G 1 0
## 5 ENST00000498… 0 0 CRBN-212 ENSG00… siME… G 2 0
## 6 ENST00000498… 0 0 CRBN-212 ENSG00… siME… G 3 0
## 7 ENST00000498… 0.370 0 CRBN-212 ENSG00… Cont D 1 1
## 8 ENST00000498… 0.294 0 CRBN-212 ENSG00… Cont D 2 1
## 9 ENST00000498… 0 0 CRBN-212 ENSG00… Cont D 3 0
## 10 ENST00000459… 0.304 0 CRBN-205 ENSG00… siME… I 1 1.08
## # ℹ 330,443 more rows
## # ℹ 13 more variables: total_reads <dbl>, seqname <chr>, source <chr>,
## # feature <chr>, start <dbl>, end <dbl>, score <chr>, strand <chr>,
## # frame <chr>, gene_type <chr>, gene_name <chr>, transcript_type <chr>,
## # log10_cpm_plus <dbl>
# Hexbin plot of transcript expression (x) against the number of counted sites
# per transcript (y).
#
# The x aesthetic uses the precomputed `log10_cpm_plus` column, i.e.
# log10(CPM + 0.01), so rows with CPM == 0 stay in the plot. Mapping raw `cpm`
# through scale_x_log10() turned those rows into -Inf and stat_binhex() dropped
# them, while the axis label claimed a pseudocount that was never applied.
# The label now states the pseudocount actually used for the plotted values.
correlation_expression_num_m3C_sites <-
  expression_num_m3C_sites |>
  ggplot(aes(x = log10_cpm_plus, y = num_sites)) +
  geom_hex() +
  # Hexagon fill (the per-bin count) is shown on a log10 colour scale.
  scale_fill_viridis_c(trans = 'log10') +
  labs(x = 'log10(CPM + 0.01)', y = 'the number of m3C sites')

# Save the figure via the project helper (defined outside this chunk). The call
# form is left unchanged: the plot object is piped in as the first argument.
correlation_expression_num_m3C_sites |>
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 145933 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 145933 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 145933 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 145933 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 145933 rows containing non-finite outside the scale range
## (`stat_binhex()`).
